In [78]:
import graphlab

In [79]:
graphlab.canvas.set_target('ipynb')

In [80]:
homeData= graphlab.SFrame('home_data.gl/')

In [81]:
homeData


Out[81]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront
7129300520 2014-10-13 00:00:00+00:00 221900 3 1 1180 5650 1 0
6414100192 2014-12-09 00:00:00+00:00 538000 3 2.25 2570 7242 2 0
5631500400 2015-02-25 00:00:00+00:00 180000 2 1 770 10000 1 0
2487200875 2014-12-09 00:00:00+00:00 604000 4 3 1960 5000 1 0
1954400510 2015-02-18 00:00:00+00:00 510000 3 2 1680 8080 1 0
7237550310 2014-05-12 00:00:00+00:00 1225000 4 4.5 5420 101930 1 0
1321400060 2014-06-27 00:00:00+00:00 257500 3 2.25 1715 6819 2 0
2008000270 2015-01-15 00:00:00+00:00 291850 3 1.5 1060 9711 1 0
2414600126 2015-04-15 00:00:00+00:00 229500 3 1 1780 7470 1 0
3793500160 2015-03-12 00:00:00+00:00 323000 3 2.5 1890 6560 2 0
view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat
0 3 7 1180 0 1955 0 98178 47.51123398
0 3 7 2170 400 1951 1991 98125 47.72102274
0 3 6 770 0 1933 0 98028 47.73792661
0 5 7 1050 910 1965 0 98136 47.52082
0 3 8 1680 0 1987 0 98074 47.61681228
0 3 11 3890 1530 2001 0 98053 47.65611835
0 3 7 1715 0 1995 0 98003 47.30972002
0 3 7 1060 0 1963 0 98198 47.40949984
0 3 7 1050 730 1960 0 98146 47.51229381
0 3 7 1890 0 2003 0 98038 47.36840673
long sqft_living15 sqft_lot15
-122.25677536 1340.0 5650.0
-122.3188624 1690.0 7639.0
-122.23319601 2720.0 8062.0
-122.39318505 1360.0 5000.0
-122.04490059 1800.0 7503.0
-122.00528655 4760.0 101930.0
-122.32704857 2238.0 6819.0
-122.31457273 1650.0 9711.0
-122.33659507 1780.0 8113.0
-122.0308176 2390.0 7570.0
[21613 rows x 21 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Question -1

Find the zip code with the highest average house sales price (from the earlier work sheet) and within the houses in that zip code, compute the average price


In [82]:
import graphlab.aggregate as agg
homeData.groupby(key_columns='zipcode',operations={'avg_sales_price' : agg.MEAN('price')})


Out[82]:
zipcode avg_sales_price
98033 803719.532407
98032 251296.24
98065 527961.203226
98077 682774.878788
98144 594547.641399
98136 551688.673004
98115 619900.5506
98075 790576.668524
98034 521652.858716
98058 353608.635165
[70 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

In [83]:
import numpy as np
np.average(homeData.filter_by(['98033'],'zipcode')['price'])


Out[83]:
803719.53240740742

In [84]:
def is_valid_home(sqft):
    return (sqft >2000) & (sqft <4000)

In [85]:
q2homes = homeData[homeData['sqft_living'].apply(lambda x : is_valid_home(x))]

In [86]:
len(q2homes)


Out[86]:
9111

In [87]:
len(homeData)


Out[87]:
21613

In [88]:
(len(q2homes)/float(len(homeData)))*100


Out[88]:
42.155184379771434

In [89]:
advanced_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
                     'condition', # condition of house
                     'grade', # measure of quality of construction
                     'waterfront', # waterfront property
                     'view', # type of view
                     'sqft_above', # square feet above ground
                     'sqft_basement', # square feet in basement
                     'yr_built', # the year built
                     'yr_renovated', # the year renovated
                     'lat', 'long', # the lat-long of the parcel
                     'sqft_living15', # average sq.ft. of 15 nearest neighbors 
                     'sqft_lot15' # average lot size of 15 nearest neighbors 
]

Create the test train split


In [90]:
train_data, test_data = homeData.random_split(0.7,seed=0)

In [91]:
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

In [92]:
my_features_model = graphlab.linear_regression.create(train_data,target = 'price',features=my_features)


PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 14542
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.029961     | 3727238.710196     | 1017359.987928       | 180643.719657 | 143067.739283   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+

In [93]:
adv_feature_model =  graphlab.linear_regression.create(train_data,target='price',features=advanced_features)


PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 14430
PROGRESS: Number of features          : 18
PROGRESS: Number of unpacked features : 18
PROGRESS: Number of coefficients    : 127
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.039397     | 3455720.025271     | 1406097.524156       | 152565.847787 | 137216.740082   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+

In [94]:
print my_features_model.evaluate(test_data)


{'max_error': 5280200.637821782, 'rmse': 197354.96729077716}

In [95]:
print adv_feature_model.evaluate(test_data)


{'max_error': 5097978.206195663, 'rmse': 175532.00695222418}

In [96]:
my_features_model.evaluate(test_data).get('rmse')- adv_feature_model.evaluate(test_data).get('rmse')


Out[96]:
21822.960338552977

In [97]:
train_data_1, test_data_1 = homeData.random_split(0.8,seed=0)
my_features_model_1 = graphlab.linear_regression.create(train_data_1,target = 'price',features=my_features)
adv_feature_model_1 =  graphlab.linear_regression.create(train_data_1,target='price',features=advanced_features)


PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 16455
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.030532     | 3743119.706075     | 2061742.555508       | 182260.544700 | 176889.975691   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 16480
PROGRESS: Number of features          : 18
PROGRESS: Number of unpacked features : 18
PROGRESS: Number of coefficients    : 127
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.037638     | 3474016.382253     | 1394980.351903       | 154597.533311 | 155621.449739   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+

In [98]:
print my_features_model_1.evaluate(test_data_1)
print adv_feature_model_1.evaluate(test_data_1)


{'max_error': 3464315.0413833335, 'rmse': 179142.60767619964}
{'max_error': 3558676.7762994887, 'rmse': 156717.7897720566}

In [99]:
my_features_model_1.evaluate(test_data_1).get('rmse')- adv_feature_model_1.evaluate(test_data_1).get('rmse')


Out[99]:
22424.81790414304

In [ ]: